
# coding: utf-8

# In[16]:


from urllib import request,parse
import json
import time
import random
import hashlib
import urllib
import re
import os
# Work from the study folder so that relative paths used later in the
# script resolve there.
# Raw string: "\p" is an invalid escape sequence in a normal literal and
# triggers a warning on current Python versions; the path itself is unchanged.
os.chdir(r"E:\pythonstudy")
#page = int(input())
# url = "http://sz.58.com/chuzu/pn"+str(page)+"/?ClickID=1"
# print(url)
#url = "http://maoyan.com/board/4?offset="+str(page)


# In[24]:


def maoyan(url):
    """Download one Maoyan board page and extract its movie entries.

    Only the rank, poster image, cast line and release time are captured.
    The original author noted that matching the remaining fields made the
    regex too slow to finish.

    Args:
        url: Address of the page to fetch (anything ``urlopen`` accepts).

    Returns:
        A list of ``(rank, poster_src, star, release_time)`` string tuples,
        one per matched ``<dd>`` entry, in page order. The list is empty
        when nothing on the page matches.

    Raises:
        urllib.error.URLError: If the page cannot be fetched.
        UnicodeDecodeError: If the response body is not valid UTF-8.
    """
    # Use the response as a context manager so the connection is always
    # closed, even when reading or decoding fails.
    with urllib.request.urlopen(url) as res:
        html = res.read().decode("utf-8")

    # Raw-string pieces: "\s" in a normal literal is an invalid escape
    # sequence. Joined together they form exactly the original pattern.
    pat = (
        r'<dd>\s+<i class=".*?">(.*?)</i>'
        r'.*?<img src="(.*?)" alt="" class="poster-default" />'
        r'.*?<p class="star">\s+(.*?)\s+</p>'
        r'\s+<p class="releasetime">上映时间：(.*?)</p>'
        r'\s+</div>\s+<div class="movie-item-number scor'
    )
    # re.S lets "." span newlines, since each entry covers several lines.
    dist = re.findall(pat, html, re.S)
    return dist


# In[18]:


# The board is paginated through the "offset" query parameter, which grows
# by 10 per page; walk the first 10 pages and collect every extracted entry.
h = []
for i in range(0, 100, 10):
    url = "http://maoyan.com/board/4?offset={}".format(i)
    h.extend(maoyan(url))


# In[23]:


# Dump the collected entries to a text file, one tuple repr per line.
with open("maoyan.txt", 'w', encoding="utf-8") as f:
    for i in h:
        f.write(str(i) + "\n")

